In [ ]:
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as sk
import matplotlib as mpl
import matplotlib.pylab as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from hanspell import spell_checker #한글 맞춤법 검사기 (python 2.7) #https://github.com/ssut/py-hanspell
In [ ]:
# Scrape campaign pages 1-99.
#   wadiz_df          -- one row of campaign metadata per page that parsed
#   project_money_all -- one row per individual contribution (backer page)
# The text slices ([18:-3], [97:-93], ...) are tied to the page markup this
# notebook was written against and are kept exactly as they were.
wadiz_df = pd.DataFrame(columns=["project_id", "title", "area", "category", "target", "result", "duration",
                                 "comment_all", "comment_user", "comment_provider", "money_supporter",
                                 "sign_supporter"])
project_money_columns = ["project_id", "funding_money", "funding_date"]
project_money_frames = []
for page in range(1, 100):
    project_id = page
    try:
        response = requests.get("http://www.wadiz.kr/web/campaign/detail/{page_num}".format(page_num=project_id))
        dom = BeautifulSoup(response.content, "html.parser")
        title = dom.select("div.wd-ui-title-wrap h1.wd-h1")[0].text
        area = dom.select("div.wd-ui-campaign-info li.wd-data-area")[0].text
        category = dom.select("div.wd-ui-campaign-info li.wd-data-tag")[0].text
        target = dom.select("div.wd-info-target em.wd-data-target")[0].text
        result = dom.select("div.wd-ui-target-old span.wd-data-collection")[0].find("em").text
        comment_all = dom.select_one("div.wd-ui-tab-wrap").find_all("li")[1].text[18:-3]
        comment_provider = len(dom.select("ul.wd-list-reply"))
        comment_user = int(comment_all) - comment_provider
        # The value is not used, but a page without this element is still
        # skipped (AttributeError), exactly as before.
        number_join_all = dom.select_one("li.wd-last").text[17:-6]
        number_money = dom.select("li.wd-data-money")[0].text[6:-1][:-1]
        number_supporter = dom.select("li.wd-data-sign")[0].text[6:-1][:-1]
        duration = dom.select("li.wd-data-date")[0].text[-23:]
        wadiz_df.loc[len(wadiz_df)] = [project_id, title, area, category, target, result, duration, comment_all,
                                       comment_user, comment_provider, number_money, number_supporter]

        # Individual contributions of the same project.
        p_id = page
        response_1 = requests.get(
            "http://www.wadiz.kr/web/campaign/detailBacker/{project_num}".format(project_num=p_id))
        dom_1 = BeautifulSoup(response_1.content, "html.parser")
        # Every second <strong> holds an amount; strip the thousands separators.
        amounts = [tag.text[97:-93].replace(",", "")
                   for tag in dom_1.select("span.wd-data-sponsor strong")[1::2]]
        # The date is cut out of the inline <script>; keep its first 10 characters.
        dates = [tag.text[67:-53][0:10] for tag in dom_1.select("span.wd-data-sponsor script")]
        project_money = pd.DataFrame({"project_id": p_id,
                                      "funding_money": pd.Series(amounts, dtype=object),
                                      "funding_date": pd.Series(dates, dtype=object)},
                                     columns=project_money_columns)
        # Amounts and dates are paired by position; when the two lists differ
        # in length the unpaired rows are dropped instead of being filled with
        # the project id.
        project_money = project_money.dropna(subset=["funding_money", "funding_date"])
        project_money = project_money.loc[project_money["funding_money"] != ""]
        if not project_money.empty:
            project_money_frames.append(project_money)
    except (requests.RequestException, AttributeError, IndexError, ValueError):
        # Missing page, unexpected markup or a non-numeric counter: skip the
        # page. KeyboardInterrupt and programming errors are no longer hidden.
        continue

# One concatenation at the end (DataFrame.append no longer exists in current
# pandas) with a fresh 0..n-1 index.
if project_money_frames:
    project_money_all = pd.concat(project_money_frames, ignore_index=True)
else:
    project_money_all = pd.DataFrame(columns=project_money_columns)
In [ ]:
# Save the scraped data to CSV files in the working directory.
wadiz_df.to_csv('wadiz_df_0329.csv', encoding='utf-8')
project_money_all.to_csv('project_money_all_0329.csv')
In [ ]:
# Keep only the projects whose final amount raised is greater than zero.
# "result" is still the scraped text at this point (the commas are only
# removed further down), so the comparison is made on a numeric view of it;
# the column itself is left untouched for the later clean-up cells.
result_amount = pd.to_numeric(wadiz_df["result"].astype(str).str.replace(",", ""), errors="coerce")
wadiz_df = wadiz_df[result_amount > 0].copy()
In [ ]:
# Date handling: split "duration" on '-' into a start and an end string.
# The first two parts are taken, as before; a row without a second part gets
# NaN instead of breaking the whole cell, and an empty frame is handled too.
duration_parts = wadiz_df["duration"].astype(str).str.split('-')
wadiz_df["date_start"] = duration_parts.str[0]
wadiz_df["date_end"] = duration_parts.str[1]
wadiz_df = wadiz_df.drop("duration", axis=1)
In [ ]:
# Date anomalies: remove rows whose start date is only the whitespace filler.
has_blank_start = wadiz_df['date_start'] == '\t\t\t\t\t\t\t\t\t\t '
wadiz_df = wadiz_df[~has_blank_start]
In [ ]:
# Parse the start and end strings into datetimes.
for date_column in ("date_start", "date_end"):
    wadiz_df[date_column] = pd.to_datetime(wadiz_df[date_column])
In [ ]:
# Add the funding period (date_duration) = end date - start date.
wadiz_df["date_duration"] = wadiz_df["date_end"].sub(wadiz_df["date_start"])
In [ ]:
# Preview the first rows of wadiz_df.
wadiz_df.head()
In [ ]:
# Extract the year and month of the start date with the vectorised
# datetime accessor instead of a per-row lambda.
wadiz_df['year'] = wadiz_df['date_start'].dt.year
wadiz_df['month'] = wadiz_df['date_start'].dt.month
In [ ]:
# Translate the Korean area / category labels to English.
# Each column is replaced in a single assignment: the previous
# wadiz_df[col][mask] = value form is chained indexing, which may write to a
# temporary copy and does nothing under pandas copy-on-write.
# The English spellings are kept exactly as they were, because they end up in
# the output data and in the dummy column names.
area_labels = {
    u'서울특별시': 'seoul',
    u'경기도': 'kyungki',
    u'부산광역시': 'busan',
    u'인천광역시': 'incheon',
    u'경상북도': 'kyungbuk',
    u'전라북도': 'jeonbuk',
    u'강원도': 'kangwon',
    u'대구광역시': 'deagu',
    u'충청남도': 'chungnam',
    u'충청북도': 'chungbuk',
    u'대전광역시': 'deajeon',
    u'광주광역시': 'gwangju',
    u'경상남도': 'kyungnam',
    u'제주특별자치도': 'jeju',
    u'울산광역시': 'ulsan',
    u'전라남도': 'jeonnam',
    u'세종특별자치시': 'sejong',
}
category_labels = {
    u"나눔/공익": 'share/public',
    u"라이프/패션": 'life/fashion',
    u"테크/디자인": 'tech/design',
    u"교육": 'education',
    u"책/영화": 'book/movie',
    u"음악/공연": 'music/concert',
    u"미술/사진/전시": 'art/photo/exhibit',
    u"환경": 'environment',
    u"스포츠": 'sports',
    u"여행": 'travel',
    u"게임/만화": 'game/comics',
    u"피규어/웹툰": 'figure/webtoon',
}
# Only exact matches are replaced; any other value is left as it is.
wadiz_df["area"] = wadiz_df["area"].replace(area_labels)
wadiz_df["category"] = wadiz_df["category"].replace(category_labels)
In [ ]:
# Category anomalies: a missing category becomes 'etc'.
# Assigned back explicitly; an in-place fillna on a selected column is a
# chained operation that may not reach the DataFrame.
wadiz_df["category"] = wadiz_df["category"].fillna('etc')
In [ ]:
# Label-encode category and area into <column>_label.
# The same encoder is refitted for each column, category first, so after this
# cell `le` holds the classes of "area".
le = LabelEncoder()
for source_column in ("category", "area"):
    wadiz_df[source_column + "_label"] = le.fit_transform(wadiz_df[source_column])
In [ ]:
# One-hot encode category, area, year and month and append the indicator
# columns to wadiz_df.
category_dummy = pd.get_dummies(wadiz_df['category'], prefix='category_label')
# The area indicators used to carry the 'category_label' prefix as well, which
# made them indistinguishable from the category indicators.
area_dummy = pd.get_dummies(wadiz_df['area'], prefix='area_label')
month = pd.get_dummies(wadiz_df.month, prefix="month")
year = pd.get_dummies(wadiz_df.year, prefix="year")
wadiz_df = pd.concat([wadiz_df, category_dummy, area_dummy, year, month], axis=1)
In [ ]:
# Remove the thousands separators from the amount columns.
for amount_column in ('result', 'target'):
    wadiz_df[amount_column] = wadiz_df[amount_column].map(lambda amount: amount.replace(",", ""))
In [ ]:
# Convert the amount columns to int.
for amount_column in ('result', 'target'):
    wadiz_df[amount_column] = wadiz_df[amount_column].map(int)
In [ ]:
# funding_rate = amount raised / target amount.
# success = 1.0 when the target was reached (rate >= 1), otherwise 0.0.
# Built in one assignment rather than wadiz_df["success"][mask] = value,
# which is chained indexing and may not write to the DataFrame.
wadiz_df["funding_rate"] = wadiz_df["result"] / wadiz_df["target"]
reached_target = (wadiz_df["funding_rate"] >= 1).astype(float)
# An undefined rate (NaN) stays NaN, as it did before.
wadiz_df["success"] = reached_target.where(wadiz_df["funding_rate"].notna())
In [ ]:
# Make project_id an integer column.
wadiz_df['project_id'] = wadiz_df['project_id'].map(int)
In [ ]:
# DataFrame used for the date calculations: every contribution joined with
# the data of its project (default inner join on project_id).
date_difference = project_money_all.merge(wadiz_df, on="project_id")
In [ ]:
# Parse the funding / start dates into datetimes.
project_money_all["funding_date"] = pd.to_datetime(project_money_all["funding_date"])
for date_column in ("funding_date", "date_start"):
    date_difference[date_column] = pd.to_datetime(date_difference[date_column])
In [ ]:
# Display the gap between each contribution date and its project's start date.
date_difference["funding_date"] - date_difference["date_start"]
In [ ]:
# Difference between the project's opening date and each contribution date.
# The opening date is looked up per row through project_id, so the values stay
# aligned with project_money_all's own index. (Assigning the difference
# computed on the merged frame pairs rows by index label only, which is wrong
# as soon as the inner merge dropped a row.)
start_by_project = pd.Series(pd.to_datetime(wadiz_df["date_start"]).values,
                             index=wadiz_df["project_id"].astype("int64").values)
start_by_project = start_by_project[~start_by_project.index.duplicated()]
project_money_all["date_difference"] = (
    pd.to_datetime(project_money_all["funding_date"])
    - project_money_all["project_id"].astype("int64").map(start_by_project)
)
# Remove anomalies: contributions with no matching project (NaT never
# satisfies the comparison) and contributions dated before the opening date.
project_money_all = project_money_all[project_money_all["date_difference"] >= pd.Timedelta(0)].copy()
In [ ]:
# Date handling: express date_difference as a float number of days
# (the former divisor 8.64e13 is the number of nanoseconds in one day).
project_money_all["date_difference"] = (
    pd.to_timedelta(project_money_all["date_difference"]) / pd.Timedelta(days=1)
)
In [ ]:
# Display project_money_all.
project_money_all
In [ ]:
# Inspect the type of the first date_difference value. Positional access is
# used because the row labelled 0 may have been removed by the filtering above.
type(project_money_all['date_difference'].iloc[0])
In [ ]:
# Create the 0day_difference column, filled with 1.0.
project_money_all["0day_difference"] = 1.0
In [ ]:
# For each cut-off of 0..5 days, flag a contribution as "short" when it was
# made within that many days of the project start and as "long" otherwise.
# Each column is built in one assignment; the previous
# frame[col][mask] = value form is chained indexing and may not write through.
for number in range(6):
    project_money_all["{number}day_difference".format(number=number)] = np.where(
        project_money_all["date_difference"] <= number, "short", "long")
In [ ]:
# Convert the contribution amounts to int.
project_money_all['funding_money'] = project_money_all['funding_money'].map(int)
In [ ]:
# One subset per cut-off: the contributions flagged "short" for 0..5 days.
zero_day, one_day, two_day, three_day, four_day, five_day = [
    project_money_all.loc[project_money_all["{}day_difference".format(days)] == "short"]
    for days in range(6)
]
In [ ]:
# Total per project for every cut-off. Only the two numeric columns are
# summed: summing every column concatenates the "short"/"long" strings and
# fails on the datetime column in current pandas.
summed_columns = ["funding_money", "date_difference"]
zero_day = zero_day.groupby("project_id", as_index=False)[summed_columns].sum()
one_day = one_day.groupby("project_id", as_index=False)[summed_columns].sum()
two_day = two_day.groupby("project_id", as_index=False)[summed_columns].sum()
three_day = three_day.groupby("project_id", as_index=False)[summed_columns].sum()
four_day = four_day.groupby("project_id", as_index=False)[summed_columns].sum()
five_day = five_day.groupby("project_id", as_index=False)[summed_columns].sum()
In [ ]:
# Rename funding_money to <n>day_funding_money in each per-cut-off frame.
zero_day, one_day, two_day, three_day, four_day, five_day = [
    frame.rename(columns={"funding_money": "{}day_funding_money".format(days)})
    for days, frame in enumerate((zero_day, one_day, two_day, three_day, four_day, five_day))
]
In [ ]:
# Rename date_difference to <n>day_date in each per-cut-off frame.
zero_day, one_day, two_day, three_day, four_day, five_day = [
    frame.rename(columns={"date_difference": "{}day_date".format(days)})
    for days, frame in enumerate((zero_day, one_day, two_day, three_day, four_day, five_day))
]
In [ ]:
# Attach the per-cut-off totals to the project table.
# A left join keeps exactly the projects already in wadiz_df; an outer join
# would also add rows for project ids that exist only in the contribution
# data, and the later fillna(0) would turn them into all-zero projects.
for daily_totals in (zero_day, one_day, two_day, three_day, four_day, five_day):
    wadiz_df = pd.merge(wadiz_df, daily_totals, on="project_id", how='left')
In [ ]:
# NaN check: print the number of missing values in every column.
# A NaN here means no money was funded within the 0-5 day window.
for column in wadiz_df.columns:
    print(wadiz_df["{}".format(column)].isnull().sum())
In [ ]:
# Replace every missing value with 0.
wadiz_df = wadiz_df.fillna(0)
In [ ]:
# Check the NaN counts again, column by column.
for column in wadiz_df.columns:
    print(wadiz_df["{}".format(column)].isnull().sum())
In [ ]:
# Create <n>day_funding_rate = money funded within n days / target amount.
for days in range(6):
    funded = wadiz_df["{}day_funding_money".format(days)]
    wadiz_df["{}day_funding_rate".format(days)] = funded.div(wadiz_df["target"])
In [ ]:
# Count, per cut-off, the projects whose funding rate is already >= 1.
for days in range(6):
    print((wadiz_df["{}day_funding_rate".format(days)] >= 1).sum())
In [ ]:
# Natural log of every <n>day_funding_rate.
# NOTE(review): a rate of 0 yields -inf here -- confirm that is intended.
for days in range(6):
    wadiz_df["{}day_log_funding_rate".format(days)] = np.log(wadiz_df["{}day_funding_rate".format(days)])
In [ ]:
# Save the processed project table.
wadiz_df.to_csv('wadiz_df_0329_1.csv', encoding='utf-8')
In [ ]:
# Scrape the Q&A page of every project in wadiz_df.
#   user_data / user_data_all         -- comments from the user list
#   provider_data / provider_data_all -- entries of the reply lists
project_id = wadiz_df.project_id
user_data = pd.DataFrame(columns=['project_id', 'user_id', 'comment', 'date'])
user_data_all = pd.DataFrame()
provider_data = pd.DataFrame(columns=['project_id', 'provider_id', 'comment', 'date'])
provider_data_all = pd.DataFrame()
for project_id_list in project_id:
    response = requests.get('https://www.wadiz.kr/web/campaign/detail/qa/{project_id_list}'.format(
        project_id_list=project_id_list))
    dom = BeautifulSoup(response.content, 'html.parser')
    # NOTE(review): the selector ends in "li." (empty class name); current
    # BeautifulSoup/soupsieve versions may reject it -- confirm the intended
    # class before changing it.
    user_all = dom.select('div.wd-ui-recommend li.')
    print(project_id_list)
    for user in user_all:
        try:
            user_url = user.select_one('a.wd-data-name').get('href')
            user_comment = user.select_one('span').text
            user_date = user.select_one('span.wd-data-whenCreated').text
        except AttributeError:
            # select_one returned None: the entry lacks an expected element.
            continue
        user_data.loc[len(user_data)] = [project_id_list, user_url, user_comment, user_date]
    for provider in dom.select('ul.wd-list-reply'):
        try:
            provider_url = provider.select_one('a.wd-data-name').get('href')
            provider_spans = provider.select('span')
            provider_comment = provider_spans[-2].text
            provider_date = provider_spans[-1].text
        except (AttributeError, IndexError):
            # Reply block without the expected link or spans: skip it.
            continue
        provider_data.loc[len(provider_data)] = [project_id_list, provider_url, provider_comment, provider_date]

# Added once, after the loop, so no row is duplicated. pd.concat replaces
# DataFrame.append, which no longer exists in current pandas.
user_data_all = pd.concat([user_data_all, user_data])
provider_data_all = pd.concat([provider_data_all, provider_data])
In [ ]:
# Save the scraped user and provider comments.
user_data_all.to_csv('user_data_all_0329.csv', encoding='utf-8')
provider_data_all.to_csv('provider_data_all_0329.csv', encoding='utf-8')
In [ ]:
# Run the spell checker on every provider comment and collect one row per
# comment: the provider_data_all row, the fields returned by the checker and
# comment_length (the number of entries in the checker's "words" field).
checked_rows = []
for i in range(len(provider_data_all)):
    try:
        result = spell_checker.check(provider_data_all['comment'][i])
        comment = pd.DataFrame(provider_data_all.loc[i]).T
        comment_result = pd.DataFrame([result])
        comment_result.index = comment.index
        comment_result_df = comment.join(comment_result)
        comment_result_df['comment_length'] = len(comment_result_df['words'].iloc[0])
    except Exception:
        # Best effort, as before: a comment that cannot be checked is skipped.
        # Unlike a bare except, Ctrl-C still stops the loop.
        continue
    checked_rows.append(comment_result_df)
    if i % 100 == 0:
        # Progress indicator.
        print(i)

# One concatenation instead of repeated DataFrame.append calls (removed from
# current pandas; the old bare except hid that failure and left the result
# empty).
if checked_rows:
    comment_analysis = pd.concat(checked_rows)
else:
    # Column list (not a set, which has no order); "errors" is included so
    # the follow-up cells still find the column on an empty result.
    comment_analysis = pd.DataFrame(columns=['project_id', 'provider_id', 'result', 'original', 'checked',
                                             'errors', 'words', 'time', 'comment_length'])
In [ ]:
# Create comment_error from four columns of comment_analysis.
error_columns = [comment_analysis.project_id, comment_analysis.errors,
                 comment_analysis.provider_id, comment_analysis.comment_length]
comment_error = pd.DataFrame(error_columns).T
# Convert the count columns to int.
for count_column in ('errors', 'comment_length'):
    comment_error[count_column] = comment_error[count_column].map(int)
In [ ]:
# comment_error handling: make sure both count columns are int.
for count_column in ('errors', 'comment_length'):
    comment_error[count_column] = comment_error[count_column].map(int)
In [ ]:
# Total errors and comment length per project. Only the two numeric columns
# are summed; summing provider_id as well would concatenate its strings in
# current pandas.
comment_error = comment_error.groupby(by='project_id', as_index=False)[['errors', 'comment_length']].sum()
In [ ]:
# Create provider_grammar_level: the errors found in a project's comments
# divided by the total comment length (word count).
comment_error['provider_grammar_level'] = comment_error['errors'].div(comment_error['comment_length'])
In [ ]:
# Save the per-comment spell-check results.
comment_analysis.to_csv('comment_analysis.csv', encoding='utf-8')
In [ ]:
# Join the project table with the per-project comment error totals,
# keeping only the projects present in both.
wadiz_provider_analysis = wadiz_df.merge(comment_error, how='inner', on='project_id')
In [ ]:
# Save the merged project / comment-error table.
wadiz_provider_analysis.to_csv('wadiz_provider_analysis_0329.csv', encoding='utf-8')
In [ ]:
# Preview the first rows of wadiz_df.
wadiz_df.head()
In [ ]:
# Display wadiz_provider_analysis.
wadiz_provider_analysis
In [ ]:
# Display project_money_all.
project_money_all
In [ ]:
# Display provider_data_all.
provider_data_all
In [ ]:
# Display user_data_all.
user_data_all
In [ ]: